import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Load the Google-App CSV into the `google` DataFrame used by every later cell.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine without editing it.
google=pd.read_csv(r"E:\Ankit Jain\D drive\Aviraj Personal File\IMS Analytics Class\Github sets\Google-App\Google-App.csv")
# Preview 7 randomly chosen rows; no random_state is given, so the rows differ on every run.
google.sample(7)
| App | Category | Rating | Reviews | Viewers | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7706 | English Bite | FAILY | 3.5 | 268 | 9800000 | 10000 | Free | 0.0 | Everyone | Education | 25-Jan-16 | 1.3.0 | 4.0 and up |
| 5106 | B Physiotherapy Clinic | HEALTH_AND_FITNESS | 5.0 | 3 | 9300000 | 100 | Free | 0.0 | Everyone | Health & Fitness | 09-Oct-17 | 0.0.1 | 4.1 and up |
| 3389 | R Studio | SHOPPING | 3.7 | 23 | 8900000 | 5000 | Free | 0.0 | Everyone | Shopping | 08-Jul-16 | 2.1.1 | 4.0.3 and up |
| 6560 | CX-OF | FAILY | 4.2 | 18 | 37000000 | 1000 | Free | 0.0 | Everyone | Entertainent | 31-Jan-18 | 7.2.1 | 4.3 and up |
| 7088 | Guardian Hunter: SuperBrawlRPG | FAILY | 4.5 | 364013 | 27000000 | 1000000 | Free | 0.0 | Everyone 10+ | Role Playing | 06-Aug-18 | 14.4.4.00 | 4.1 and up |
| 2912 | C Offline Tutorial | BOOKS_AND_REFERENCE | 4.7 | 88 | 4200000 | 1000 | Free | 0.0 | Everyone | Books & Reference | 08-Dec-17 | 1.0.1 | 4.1 and up |
| 8088 | EP Horlogerie | LIFESTYLE | 4.9 | 18 | 24000000 | 1000 | Free | 0.0 | Everyone | Lifestyle | 28-Mar-18 | 4.0.2 | 4.0.3 and up |
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
google.describe()
| Rating | Reviews | Viewers | Installs | Price | |
|---|---|---|---|---|---|
| count | 7729.000000 | 9.145000e+03 | 9.145000e+03 | 9.145000e+03 | 9145.000000 |
| mean | 4.173852 | 2.490487e+05 | 2.305444e+07 | 7.114842e+06 | 1.184366 |
| std | 0.544563 | 1.716211e+06 | 2.331165e+07 | 4.619357e+07 | 17.355754 |
| min | 1.000000 | 0.000000e+00 | 8.500000e+05 | 0.000000e+00 | 0.000000 |
| 25% | 4.000000 | 2.200000e+01 | 5.500000e+06 | 1.000000e+03 | 0.000000 |
| 50% | 4.300000 | 7.420000e+02 | 1.400000e+07 | 1.000000e+05 | 0.000000 |
| 75% | 4.500000 | 2.503700e+04 | 3.200000e+07 | 1.000000e+06 | 0.000000 |
| max | 5.000000 | 4.489389e+07 | 1.020000e+08 | 1.000000e+09 | 400.000000 |
# (rows, columns) of the loaded dataset.
google.shape
(9145, 13)
# Print each column's dtype and non-null count to spot columns with missing values.
google.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9145 entries, 0 to 9144 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 9145 non-null object 1 Category 9145 non-null object 2 Rating 7729 non-null float64 3 Reviews 9145 non-null int64 4 Viewers 9145 non-null int64 5 Installs 9145 non-null int64 6 Type 9145 non-null object 7 Price 9145 non-null float64 8 Content Rating 9145 non-null object 9 Genres 9145 non-null object 10 Last Updated 9145 non-null object 11 Current Ver 9137 non-null object 12 Android Ver 9143 non-null object dtypes: float64(2), int64(3), object(8) memory usage: 928.9+ KB
# Confirm that read_csv produced a pandas DataFrame.
type(google)
pandas.core.frame.DataFrame
# Count missing values per column before any imputation.
google.isnull().sum()
App 0 Category 0 Rating 1416 Reviews 0 Viewers 0 Installs 0 Type 0 Price 0 Content Rating 0 Genres 0 Last Updated 0 Current Ver 8 Android Ver 2 dtype: int64
# Frequency of each distinct rating, most common first (NaN is excluded by default).
google["Rating"].value_counts()
4.4 835 4.5 822 4.3 818 4.2 740 4.6 646 4.1 594 4.0 481 4.7 421 3.9 333 3.8 272 5.0 269 4.8 216 3.7 198 3.6 158 3.5 153 3.4 115 3.3 98 4.9 86 3.0 74 3.1 62 3.2 60 2.9 43 2.8 42 2.6 24 2.7 23 2.3 20 2.5 19 2.4 19 1.0 16 2.2 14 1.9 12 2.0 11 1.7 8 2.1 8 1.8 8 1.6 4 1.4 3 1.5 3 1.2 1 Name: Rating, dtype: int64
# Replace missing ratings with 0.0 and assign the result back to the column.
# The previous call passed the boolean Series `google["Rating"]==0` as the fill
# value; that only produced zeros because the mask is False at every NaN row.
# It also relied on chained `inplace=True`, which does not modify the parent
# DataFrame under pandas Copy-on-Write.
# NOTE(review): 0 lies outside the 1.0-5.0 range of the real ratings, so these
# rows pull down every later mean/sum of Rating; consider the median, or
# leaving NaN, instead.
google["Rating"]=google["Rating"].fillna(0.0)
# Re-check missing values after filling Rating.
google.isnull().sum()
App 0 Category 0 Rating 0 Reviews 0 Viewers 0 Installs 0 Type 0 Price 0 Content Rating 0 Genres 0 Last Updated 0 Current Ver 8 Android Ver 2 dtype: int64
# Most frequent value of each version column; mode() returns a Series, so this displays a tuple of two Series.
google["Current Ver"].mode(),google["Android Ver"].mode()
(0 1 dtype: object, 0 4.1 and up dtype: object)
# Impute missing version strings with the most frequent value of each column.
# Series.mode() returns a Series (there may be ties), so take its first entry.
# Wrapping the whole Series in str() would write its printed representation
# ("0    ...\ndtype: object") into the cells instead of the modal value.
# Assigning the result back (rather than chained inplace=True) also keeps this
# working under pandas Copy-on-Write.
google["Current Ver"]=google["Current Ver"].fillna(google["Current Ver"].mode().iloc[0])
google["Android Ver"]=google["Android Ver"].fillna(google["Android Ver"].mode().iloc[0])
# Verify that no missing values remain after imputing the version columns.
google.isnull().sum()
App 0 Category 0 Rating 0 Reviews 0 Viewers 0 Installs 0 Type 0 Price 0 Content Rating 0 Genres 0 Last Updated 0 Current Ver 0 Android Ver 0 dtype: int64
# Draw one box per numeric column on a shared axis, then render the figure.
google.plot.box()
plt.show()
# Number of apps per Type value.
google["Type"].value_counts()
Free 8421 Paid 724 Name: Type, dtype: int64
# Bar chart of the number of apps per Type.
# The column is passed by keyword: a bare positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from 0.12 onwards.
sns.countplot(x="Type",data=google)
<AxesSubplot:xlabel='Type', ylabel='count'>
# Scatter of Reviews against Rating, with points coloured by Type.
sns.relplot(x="Rating",y="Reviews",data=google,hue="Type")
<seaborn.axisgrid.FacetGrid at 0x1ab41aa9b20>
# Distribution of Viewers: density-normalised histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot, which is scheduled for removal;
# stat="density" keeps the y-axis on the density scale that distplot used.
sns.histplot(google["Viewers"],kde=True,stat="density")
<AxesSubplot:xlabel='Viewers', ylabel='Density'>
# Distribution of Rating: density-normalised histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot, which is scheduled for removal;
# stat="density" keeps the y-axis on the density scale that distplot used.
sns.histplot(google["Rating"],kde=True,stat="density")
<AxesSubplot:xlabel='Rating', ylabel='Density'>
# Parse "Last Updated" into datetimes; no explicit format is given, so pandas infers it.
google["Last Updated"]=pd.to_datetime(google["Last Updated"])
# Derive the calendar year of the last update as a new "Year" column.
google["Year"]=google["Last Updated"].dt.year
# Count apps per (Year, Installs) pair; margins=True adds the "All" totals row and column.
pd.crosstab(google["Year"],google["Installs"],margins=True)
| Installs | 0 | 1 | 5 | 10 | 50 | 100 | 500 | 1000 | 5000 | 10000 | ... | 100000 | 500000 | 1000000 | 5000000 | 10000000 | 50000000 | 100000000 | 500000000 | 1000000000 | All |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | |||||||||||||||||||||
| 2010 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2011 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | ... | 2 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 15 |
| 2012 | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 2 | 0 | 4 | ... | 7 | 2 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 25 |
| 2013 | 0 | 0 | 1 | 1 | 5 | 11 | 4 | 13 | 9 | 23 | ... | 14 | 2 | 7 | 4 | 2 | 1 | 0 | 0 | 0 | 105 |
| 2014 | 0 | 1 | 1 | 7 | 5 | 14 | 3 | 27 | 9 | 38 | ... | 35 | 11 | 14 | 4 | 6 | 3 | 0 | 0 | 0 | 201 |
| 2015 | 1 | 0 | 2 | 21 | 11 | 46 | 22 | 53 | 37 | 57 | ... | 68 | 18 | 54 | 13 | 15 | 0 | 1 | 0 | 0 | 443 |
| 2016 | 1 | 2 | 4 | 27 | 12 | 77 | 50 | 105 | 49 | 126 | ... | 73 | 34 | 90 | 25 | 31 | 4 | 2 | 0 | 0 | 749 |
| 2017 | 6 | 13 | 11 | 80 | 46 | 185 | 88 | 238 | 115 | 245 | ... | 192 | 81 | 178 | 52 | 82 | 19 | 4 | 0 | 0 | 1738 |
| 2018 | 6 | 48 | 59 | 239 | 120 | 367 | 155 | 433 | 241 | 518 | ... | 662 | 344 | 956 | 434 | 688 | 120 | 194 | 30 | 10 | 5868 |
| All | 14 | 64 | 78 | 376 | 201 | 702 | 322 | 873 | 461 | 1011 | ... | 1054 | 492 | 1303 | 535 | 825 | 147 | 201 | 30 | 10 | 9145 |
10 rows × 21 columns
# Make sure Rating is stored as a numeric dtype before tabulating it.
google["Rating"]=pd.to_numeric(google["Rating"])
# Count apps per (Year, Rating) pair.
pd.crosstab(google["Year"],google["Rating"])
| Rating | 0.0 | 1.0 | 1.2 | 1.4 | 1.5 | 1.6 | 1.7 | 1.8 | 1.9 | 2.0 | ... | 4.1 | 4.2 | 4.3 | 4.4 | 4.5 | 4.6 | 4.7 | 4.8 | 4.9 | 5.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | |||||||||||||||||||||
| 2010 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2011 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 |
| 2012 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 3 | 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2013 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 13 | 10 | 11 | 11 | 5 | 3 | 2 | 0 | 0 | 3 |
| 2014 | 18 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 18 | 21 | 17 | 15 | 10 | 14 | 3 | 4 | 1 | 4 |
| 2015 | 69 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 24 | 44 | 34 | 28 | 29 | 16 | 12 | 6 | 3 | 9 |
| 2016 | 149 | 3 | 0 | 1 | 0 | 0 | 0 | 2 | 2 | 1 | ... | 49 | 57 | 70 | 55 | 33 | 27 | 19 | 8 | 10 | 24 |
| 2017 | 353 | 4 | 0 | 1 | 0 | 1 | 4 | 2 | 3 | 6 | ... | 119 | 123 | 133 | 114 | 110 | 90 | 75 | 39 | 11 | 64 |
| 2018 | 806 | 7 | 1 | 1 | 3 | 2 | 3 | 3 | 6 | 4 | ... | 367 | 481 | 549 | 611 | 633 | 494 | 310 | 159 | 61 | 165 |
9 rows × 40 columns
# Count Free vs Paid apps per Year; margins=True adds the "All" totals.
pd.crosstab(google["Type"],google["Year"],margins=True)
| Year | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | All |
|---|---|---|---|---|---|---|---|---|---|---|
| Type | ||||||||||
| Free | 1 | 12 | 20 | 77 | 144 | 365 | 660 | 1575 | 5567 | 8421 |
| Paid | 0 | 3 | 5 | 28 | 57 | 78 | 89 | 163 | 301 | 724 |
| All | 1 | 15 | 25 | 105 | 201 | 443 | 749 | 1738 | 5868 | 9145 |
# Confirm that the new Year column introduced no missing values.
google.isnull().sum()
App 0 Category 0 Rating 0 Reviews 0 Viewers 0 Installs 0 Type 0 Price 0 Content Rating 0 Genres 0 Last Updated 0 Current Ver 0 Android Ver 0 Year 0 dtype: int64
# Bar per Year showing Installs; seaborn's barplot aggregates with the mean by default.
sns.barplot(x="Year",y="Installs",data=google)
<AxesSubplot:xlabel='Year', ylabel='Installs'>
# Mean Rating per Category, rounded to the nearest whole number.
google.groupby("Category")["Rating"].mean().round()
Category APS_AND_NAVIGATION 4.0 ART_AND_DESIGN 4.0 AUTO_AND_VEHICLES 3.0 BEAUTY 3.0 BOOKS_AND_REFERENCE 3.0 BUSINESS 3.0 COICS 4.0 COUNICATION 3.0 DATING 3.0 EDICAL 3.0 EDUCATION 4.0 ENTERTAINENT 4.0 EVENTS 3.0 FAILY 4.0 FINANCE 4.0 FOOD_AND_DRINK 3.0 GAE 4.0 HEALTH_AND_FITNESS 4.0 HOUSE_AND_HOE 3.0 LIBRARIES_AND_DEO 3.0 LIFESTYLE 3.0 NEWS_AND_AGAZINES 3.0 PARENTING 4.0 PERSONALIZATION 3.0 PHOTOGRAPHY 4.0 PRODUCTIVITY 3.0 SHOPPING 4.0 SOCIAL 4.0 SPORTS 3.0 TOOLS 3.0 TRAVEL_AND_LOCAL 3.0 VIDEO_PLAYERS 4.0 WEATHER 4.0 Name: Rating, dtype: float64
# Distinct Content Rating labels present in the data.
google["Content Rating"].unique()
array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
'Adults only 18+', 'Unrated'], dtype=object)
# Count apps per (Category, Content Rating) pair; margins=True adds the "All" totals.
pd.crosstab(google["Category"],google["Content Rating"],margins=True)
| Content Rating | Adults only 18+ | Everyone | Everyone 10+ | Mature 17+ | Teen | Unrated | All |
|---|---|---|---|---|---|---|---|
| Category | |||||||
| APS_AND_NAVIGATION | 0 | 104 | 1 | 1 | 2 | 0 | 108 |
| ART_AND_DESIGN | 0 | 58 | 1 | 0 | 3 | 0 | 62 |
| AUTO_AND_VEHICLES | 0 | 73 | 1 | 0 | 1 | 0 | 75 |
| BEAUTY | 0 | 39 | 2 | 3 | 3 | 0 | 47 |
| BOOKS_AND_REFERENCE | 0 | 182 | 5 | 2 | 8 | 0 | 197 |
| BUSINESS | 0 | 385 | 1 | 1 | 13 | 0 | 400 |
| COICS | 1 | 24 | 3 | 6 | 17 | 0 | 51 |
| COUNICATION | 0 | 232 | 0 | 6 | 27 | 0 | 265 |
| DATING | 0 | 21 | 0 | 179 | 12 | 0 | 212 |
| EDICAL | 0 | 414 | 8 | 7 | 5 | 0 | 434 |
| EDUCATION | 0 | 100 | 8 | 2 | 1 | 0 | 111 |
| ENTERTAINENT | 0 | 28 | 5 | 6 | 51 | 0 | 90 |
| EVENTS | 0 | 49 | 2 | 0 | 6 | 0 | 57 |
| FAILY | 0 | 1435 | 115 | 48 | 233 | 1 | 1832 |
| FINANCE | 0 | 302 | 0 | 0 | 5 | 0 | 307 |
| FOOD_AND_DRINK | 0 | 94 | 2 | 0 | 4 | 0 | 100 |
| GAE | 0 | 537 | 114 | 63 | 301 | 0 | 1015 |
| HEALTH_AND_FITNESS | 0 | 236 | 7 | 7 | 14 | 0 | 264 |
| HOUSE_AND_HOE | 0 | 66 | 0 | 0 | 2 | 0 | 68 |
| LIBRARIES_AND_DEO | 0 | 82 | 0 | 0 | 0 | 0 | 82 |
| LIFESTYLE | 0 | 313 | 4 | 5 | 23 | 0 | 345 |
| NEWS_AND_AGAZINES | 0 | 143 | 36 | 8 | 29 | 0 | 216 |
| PARENTING | 0 | 52 | 0 | 1 | 1 | 0 | 54 |
| PERSONALIZATION | 0 | 292 | 4 | 9 | 50 | 0 | 355 |
| PHOTOGRAPHY | 0 | 237 | 0 | 5 | 12 | 0 | 254 |
| PRODUCTIVITY | 0 | 294 | 1 | 1 | 7 | 0 | 303 |
| SHOPPING | 0 | 170 | 0 | 3 | 28 | 0 | 201 |
| SOCIAL | 0 | 82 | 1 | 36 | 91 | 0 | 210 |
| SPORTS | 1 | 286 | 9 | 1 | 14 | 0 | 311 |
| TOOLS | 0 | 731 | 0 | 2 | 5 | 1 | 739 |
| TRAVEL_AND_LOCAL | 0 | 188 | 0 | 1 | 3 | 0 | 192 |
| VIDEO_PLAYERS | 0 | 113 | 1 | 3 | 14 | 0 | 131 |
| WEATHER | 0 | 54 | 1 | 0 | 2 | 0 | 57 |
| All | 2 | 7416 | 332 | 406 | 987 | 2 | 9145 |
# First five rows, now including the parsed "Last Updated" and the derived "Year".
google.head()
| App | Category | Rating | Reviews | Viewers | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Photo Editor & Candy Caera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19000000 | 10000 | Free | 0.0 | Everyone | Art & Design | 2018-01-07 | 1.0.0 | 4.0.3 and up | 2018 |
| 1 | Coloring book oana | ART_AND_DESIGN | 3.9 | 967 | 14000000 | 500000 | Free | 0.0 | Everyone | Art & Design;Pretend Play | 2018-01-15 | 2.0.0 | 4.0.3 and up | 2018 |
| 2 | U Launcher Lite – FREE Live Cool Thees, Hide Apps | ART_AND_DESIGN | 4.7 | 87510 | 8700000 | 5000000 | Free | 0.0 | Everyone | Art & Design | 2018-08-01 | 1.2.4 | 4.0.3 and up | 2018 |
| 3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25000000 | 50000000 | Free | 0.0 | Teen | Art & Design | 2018-06-08 | Varies with device | 4.2 and up | 2018 |
| 4 | Pixel Draw - Nuber Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2800000 | 100000 | Free | 0.0 | Everyone | Art & Design;Creativity | 2018-06-20 | 1.1 | 4.4 and up | 2018 |
# Mean Installs (pivot_table's default aggregation) for each (Year, Type) row and
# Content Rating column; NaN marks combinations with no apps.
google.pivot_table(values="Installs",index=["Year","Type"],columns="Content Rating")
| Content Rating | Adults only 18+ | Everyone | Everyone 10+ | Mature 17+ | Teen | Unrated | |
|---|---|---|---|---|---|---|---|
| Year | Type | ||||||
| 2010 | Free | NaN | 1.000000e+05 | NaN | NaN | NaN | NaN |
| 2011 | Free | NaN | 7.401000e+05 | 5.000000e+06 | NaN | 5.000000e+06 | NaN |
| Paid | NaN | 2.033333e+03 | NaN | NaN | NaN | NaN | |
| 2012 | Free | NaN | 7.628889e+05 | NaN | 1.000000e+05 | NaN | 50000.0 |
| Paid | NaN | 2.042000e+03 | NaN | NaN | NaN | NaN | |
| 2013 | Free | NaN | 1.137140e+06 | NaN | 3.000000e+06 | 2.503667e+06 | NaN |
| Paid | NaN | 2.588978e+04 | 5.500000e+03 | NaN | 8.333333e+01 | NaN | |
| 2014 | Free | NaN | 1.184214e+06 | 5.333333e+04 | 3.516667e+06 | 7.643333e+06 | NaN |
| Paid | NaN | 2.592012e+04 | 1.000000e+04 | 5.000000e+04 | 3.200200e+05 | NaN | |
| 2015 | Free | NaN | 1.047906e+06 | 1.207188e+06 | 5.252500e+05 | 1.115855e+06 | 500.0 |
| Paid | NaN | 8.122537e+03 | 4.000000e+04 | 5.000500e+05 | 6.835000e+04 | NaN | |
| 2016 | Free | NaN | 1.330096e+06 | 7.060106e+06 | 1.291163e+06 | 7.382454e+05 | NaN |
| Paid | NaN | 8.894721e+04 | 1.792500e+05 | 1.400250e+04 | 1.537500e+05 | NaN | |
| 2017 | Free | NaN | 1.646808e+06 | 2.087488e+06 | 8.382364e+05 | 2.260488e+06 | NaN |
| Paid | NaN | 1.327061e+04 | 3.942857e+04 | 3.340000e+04 | 2.132500e+04 | NaN | |
| 2018 | Free | 750000.0 | 8.615688e+06 | 4.344189e+07 | 4.003188e+06 | 1.833282e+07 | NaN |
| Paid | NaN | 5.885069e+04 | 2.912000e+05 | 1.684202e+06 | 1.173143e+05 | NaN |
# Rating frequencies ordered by rating value rather than by count.
google["Rating"].value_counts().sort_index()
0.0 1416 1.0 16 1.2 1 1.4 3 1.5 3 1.6 4 1.7 8 1.8 8 1.9 12 2.0 11 2.1 8 2.2 14 2.3 20 2.4 19 2.5 19 2.6 24 2.7 23 2.8 42 2.9 43 3.0 74 3.1 62 3.2 60 3.3 98 3.4 115 3.5 153 3.6 158 3.7 198 3.8 272 3.9 333 4.0 481 4.1 594 4.2 740 4.3 818 4.4 835 4.5 822 4.6 646 4.7 421 4.8 216 4.9 86 5.0 269 Name: Rating, dtype: int64
import plotly.express as px
# Interactive Plotly scatter of Installs against Viewers.
px.scatter(google, x="Viewers",y="Installs")
# Grid of pairwise scatter plots (with per-variable distributions on the diagonal) for the numeric columns.
sns.pairplot(google)
<seaborn.axisgrid.PairGrid at 0x1ab43d3aa90>
# Annotated heatmap of pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly: DataFrame.corr() no longer drops
# object/datetime columns silently (pandas >= 2.0 raises on them), and
# select_dtypes works on older pandas versions too.
sns.heatmap(google.select_dtypes(include="number").corr(),annot=True)
<AxesSubplot:>
# Re-inspect the first five rows before aggregating by year.
google.head()
| App | Category | Rating | Reviews | Viewers | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Photo Editor & Candy Caera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19000000 | 10000 | Free | 0.0 | Everyone | Art & Design | 2018-01-07 | 1.0.0 | 4.0.3 and up | 2018 |
| 1 | Coloring book oana | ART_AND_DESIGN | 3.9 | 967 | 14000000 | 500000 | Free | 0.0 | Everyone | Art & Design;Pretend Play | 2018-01-15 | 2.0.0 | 4.0.3 and up | 2018 |
| 2 | U Launcher Lite – FREE Live Cool Thees, Hide Apps | ART_AND_DESIGN | 4.7 | 87510 | 8700000 | 5000000 | Free | 0.0 | Everyone | Art & Design | 2018-08-01 | 1.2.4 | 4.0.3 and up | 2018 |
| 3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25000000 | 50000000 | Free | 0.0 | Teen | Art & Design | 2018-06-08 | Varies with device | 4.2 and up | 2018 |
| 4 | Pixel Draw - Nuber Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2800000 | 100000 | Free | 0.0 | Everyone | Art & Design;Creativity | 2018-06-20 | 1.1 | 4.4 and up | 2018 |
# Yearly totals of Rating, Reviews, Viewers and Installs.
# Several columns must be selected with a list (double brackets); the bare
# tuple form `["a","b"]` on a GroupBy was deprecated and removed in pandas 2.0.
google.groupby("Year")[["Rating","Reviews","Viewers","Installs"]].sum()
| Rating | Reviews | Viewers | Installs | |
|---|---|---|---|---|
| Year | ||||
| 2010 | 4.2 | 1415 | 20900000 | 100000 |
| 2011 | 59.5 | 235800 | 213200000 | 17407100 |
| 2012 | 75.2 | 267477 | 587500000 | 13892210 |
| 2013 | 363.3 | 3518580 | 2883900000 | 100091365 |
| 2014 | 738.4 | 6368435 | 4303600000 | 254605226 |
| 2015 | 1517.8 | 12352311 | 6649000000 | 386824370 |
| 2016 | 2419.7 | 19636854 | 13668600000 | 952793592 |
| 2017 | 5663.7 | 70902122 | 34682950000 | 2676178668 |
| 2018 | 21417.9 | 2164267392 | 147823200000 | 60663340933 |
import plotly.express as px
# Interactive Plotly scatter of Rating against Reviews.
px.scatter(google,x="Reviews",y="Rating")
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Ordinary least-squares model that is fitted further down.
lin=LinearRegression()
# Features and target are selected by column name rather than by position:
# positions [2,3,4,5] and -1 silently pick different columns if the frame
# gains, loses or reorders columns (the target was simply "whatever was added last").
# NOTE(review): the target here is the year of the last update; confirm this is
# the quantity the model is meant to predict.
x=google[["Rating","Reviews","Viewers","Installs"]].values
y=google["Year"].values
# Show the feature matrix and the target vector.
# Two separate statements are used: `print(x),print(y)` builds a tuple of the
# two None return values and the notebook then displays a stray "(None, None)".
print(x)
print(y)
[[4.10000e+00 1.59000e+02 1.90000e+07 1.00000e+04] [3.90000e+00 9.67000e+02 1.40000e+07 5.00000e+05] [4.70000e+00 8.75100e+04 8.70000e+06 5.00000e+06] ... [5.00000e+00 4.00000e+00 3.60000e+06 1.00000e+02] [0.00000e+00 3.00000e+00 9.50000e+06 1.00000e+03] [4.50000e+00 3.98307e+05 1.90000e+07 1.00000e+07]] [2018 2018 2018 ... 2018 2017 2018]
(None, None)
# Hold out 30% of the rows for testing; random_state=3 makes the split reproducible.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.30,random_state=3)
# Display the four resulting arrays.
xtrain,xtest,ytrain,ytest
(array([[5.0000e+00, 4.0000e+00, 1.5000e+07, 5.0000e+01],
[4.4000e+00, 2.5580e+03, 6.1000e+07, 1.0000e+05],
[4.0000e+00, 1.2781e+05, 3.3000e+07, 1.0000e+07],
...,
[3.7000e+00, 6.0000e+00, 7.8000e+06, 1.0000e+03],
[3.8000e+00, 3.6028e+04, 5.0000e+07, 5.0000e+06],
[3.8000e+00, 1.2414e+04, 1.3000e+07, 1.0000e+06]]),
array([[3.90000e+00, 4.78000e+02, 2.00000e+06, 1.00000e+05],
[4.00000e+00, 1.07740e+04, 3.50000e+07, 1.00000e+05],
[4.10000e+00, 1.55649e+05, 3.80000e+07, 1.00000e+07],
...,
[3.80000e+00, 4.11600e+03, 8.80000e+06, 1.00000e+05],
[5.00000e+00, 7.00000e+00, 6.80000e+07, 1.00000e+02],
[4.20000e+00, 1.25720e+04, 2.40000e+07, 1.00000e+06]]),
array([2018, 2018, 2018, ..., 2018, 2018, 2017], dtype=int64),
array([2016, 2018, 2018, ..., 2015, 2018, 2018], dtype=int64))
# Fit the linear regression on the training split.
lin.fit(xtrain,ytrain)
LinearRegression()
# Predict on the held-out rows and round each prediction to the nearest whole number.
ypred=np.round(lin.predict(xtest))
# Compare the first five true values with the first five predictions.
ytest[:5],ypred[:5]
(array([2016, 2018, 2018, 2018, 2018], dtype=int64), array([2017., 2017., 2017., 2017., 2017.]))
from sklearn import metrics
# Error metrics for the rounded predictions, reported in the target's own units
# and labelled so the three numbers can be told apart.
# The previous version multiplied every metric by 100 (these are not
# percentages) and computed the last value as sqrt(MSE*100), which is 10x the
# RMSE and on a different scale from the other two lines.
mse=metrics.mean_squared_error(ytest,ypred)
print("MSE:",round(mse,2))
print("Median absolute error:",round(metrics.median_absolute_error(ytest,ypred),2))
print("RMSE:",round(np.sqrt(mse),2))
141 100 12
# Shapes of the four arrays produced by the train/test split, in the order
# xtrain, xtest, ytrain, ytest.
tuple(arr.shape for arr in (xtrain,xtest,ytrain,ytest))
((6401, 4), (2744, 4), (6401,), (2744,))
# Fitted parameters: the coefficient vector on the first line, the intercept on the second.
print(lin.coef_,lin.intercept_,sep="\n")
[3.43659931e-02 1.72588209e-08 3.28762074e-09 1.25594595e-09] 2017.1226334283845
# Coefficient of determination (R^2) of the fitted model on the held-out split.
lin.score(xtest,ytest)
0.015407574596416729